In [7]:
import pandas as pd
# dataset
#https://www.kaggle.com/sdolezel/black-friday

1. Ler csv¶

1.1 Leitura Seletiva¶

In [8]:
pd.set_option('display.max_columns', 12)
df = pd.read_csv('train.csv')
df.head(3)
Out[8]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 1000001 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370
1 1000001 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200
2 1000001 P00087842 F 0-17 10 A 2 0 12 NaN NaN 1422
In [9]:
df_part = pd.read_csv('train.csv', nrows=5000)
df_part.shape
Out[9]:
(5000, 12)
In [10]:
df_cols = pd.read_csv('train.csv', usecols=['Age', 'City_Category', 'Occupation', 'Purchase'])
df_cols
Out[10]:
Age Occupation City_Category Purchase
0 0-17 10 A 8370
1 0-17 10 A 15200
2 0-17 10 A 1422
3 0-17 10 A 1057
4 55+ 16 C 7969
... ... ... ... ...
550063 51-55 13 B 368
550064 26-35 1 C 371
550065 26-35 15 B 137
550066 55+ 1 C 365
550067 46-50 0 B 490

550068 rows × 4 columns

1.2 Ler Vários Arquivos em um DataFrame Único - Por linha¶

In [11]:
df_1 = pd.read_csv('train.csv')
print(df_1.shape)
df_1.head()
(550068, 12)
Out[11]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 1000001 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370
1 1000001 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200
2 1000001 P00087842 F 0-17 10 A 2 0 12 NaN NaN 1422
3 1000001 P00085442 F 0-17 10 A 2 0 12 14.0 NaN 1057
4 1000002 P00285442 M 55+ 16 C 4+ 0 8 NaN NaN 7969
In [12]:
df_2 = pd.read_csv('test.csv')
print(df_2.shape)
df_2.head()
(233599, 11)
Out[12]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3
0 1000004 P00128942 M 46-50 7 B 2 1 1 11.0 NaN
1 1000009 P00113442 M 26-35 17 C 0 0 3 5.0 NaN
2 1000010 P00288442 F 36-45 1 B 4+ 1 5 14.0 NaN
3 1000010 P00145342 F 36-45 1 B 4+ 1 4 9.0 NaN
4 1000011 P00053842 F 26-35 1 C 1 0 4 5.0 12.0
In [13]:
from glob import glob
In [14]:
files = sorted(glob('*.csv'))
files
Out[14]:
['test.csv', 'train.csv']
In [15]:
pd.concat((pd.read_csv(file) for file in files) , ignore_index = True)
Out[15]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 1000004 P00128942 M 46-50 7 B 2 1 1 11.0 NaN NaN
1 1000009 P00113442 M 26-35 17 C 0 0 3 5.0 NaN NaN
2 1000010 P00288442 F 36-45 1 B 4+ 1 5 14.0 NaN NaN
3 1000010 P00145342 F 36-45 1 B 4+ 1 4 9.0 NaN NaN
4 1000011 P00053842 F 26-35 1 C 1 0 4 5.0 12.0 NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
783662 1006033 P00372445 M 51-55 13 B 1 1 20 NaN NaN 368.0
783663 1006035 P00375436 F 26-35 1 C 3 0 20 NaN NaN 371.0
783664 1006036 P00375436 F 26-35 15 B 4+ 1 20 NaN NaN 137.0
783665 1006038 P00375436 F 55+ 1 C 2 0 20 NaN NaN 365.0
783666 1006039 P00371644 F 46-50 0 B 4+ 1 20 NaN NaN 490.0

783667 rows × 12 columns

1.2 Ler Vários Arquivos em um DataFrame Único - Por coluna¶

In [16]:
from glob import glob
In [17]:
df_1 = pd.read_csv('train.csv')
df_2 = pd.read_csv('test.csv')
In [18]:
files = sorted(glob('*.csv'))
files
Out[18]:
['test.csv', 'train.csv']
In [19]:
pd.concat((pd.read_csv(file) for file in files) , axis = 'columns').head()
Out[19]:
User_ID Product_ID Gender Age Occupation City_Category ... Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 1000004.0 P00128942 M 46-50 7.0 B ... 2 0 3 NaN NaN 8370
1 1000009.0 P00113442 M 26-35 17.0 C ... 2 0 1 6.0 14.0 15200
2 1000010.0 P00288442 F 36-45 1.0 B ... 2 0 12 NaN NaN 1422
3 1000010.0 P00145342 F 36-45 1.0 B ... 2 0 12 14.0 NaN 1057
4 1000011.0 P00053842 F 26-35 1.0 C ... 4+ 0 8 NaN NaN 7969

5 rows × 23 columns

2. Renomear Colunas¶

In [20]:
df = pd.read_csv('train.csv')
df.head(2)
Out[20]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 1000001 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370
1 1000001 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200
In [21]:
new_col_names = {'Age': 'Idade','City_Category': 'Cidade' }
df = df.rename(new_col_names, axis = 'columns')
df.head()
Out[21]:
User_ID Product_ID Gender Idade Occupation Cidade Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 1000001 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370
1 1000001 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200
2 1000001 P00087842 F 0-17 10 A 2 0 12 NaN NaN 1422
3 1000001 P00085442 F 0-17 10 A 2 0 12 14.0 NaN 1057
4 1000002 P00285442 M 55+ 16 C 4+ 0 8 NaN NaN 7969
In [22]:
df.columns = df.columns.str.replace('_', '-')
df.head()
Out[22]:
User-ID Product-ID Gender Idade Occupation Cidade Stay-In-Current-City-Years Marital-Status Product-Category-1 Product-Category-2 Product-Category-3 Purchase
0 1000001 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370
1 1000001 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200
2 1000001 P00087842 F 0-17 10 A 2 0 12 NaN NaN 1422
3 1000001 P00085442 F 0-17 10 A 2 0 12 14.0 NaN 1057
4 1000002 P00285442 M 55+ 16 C 4+ 0 8 NaN NaN 7969

Prefixo e Sufixo¶

In [23]:
df.add_prefix('A_')
Out[23]:
A_User-ID A_Product-ID A_Gender A_Idade A_Occupation A_Cidade A_Stay-In-Current-City-Years A_Marital-Status A_Product-Category-1 A_Product-Category-2 A_Product-Category-3 A_Purchase
0 1000001 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370
1 1000001 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200
2 1000001 P00087842 F 0-17 10 A 2 0 12 NaN NaN 1422
3 1000001 P00085442 F 0-17 10 A 2 0 12 14.0 NaN 1057
4 1000002 P00285442 M 55+ 16 C 4+ 0 8 NaN NaN 7969
... ... ... ... ... ... ... ... ... ... ... ... ...
550063 1006033 P00372445 M 51-55 13 B 1 1 20 NaN NaN 368
550064 1006035 P00375436 F 26-35 1 C 3 0 20 NaN NaN 371
550065 1006036 P00375436 F 26-35 15 B 4+ 1 20 NaN NaN 137
550066 1006038 P00375436 F 55+ 1 C 2 0 20 NaN NaN 365
550067 1006039 P00371644 F 46-50 0 B 4+ 1 20 NaN NaN 490

550068 rows × 12 columns

In [24]:
df.add_suffix('_B')
Out[24]:
User-ID_B Product-ID_B Gender_B Idade_B Occupation_B Cidade_B Stay-In-Current-City-Years_B Marital-Status_B Product-Category-1_B Product-Category-2_B Product-Category-3_B Purchase_B
0 1000001 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370
1 1000001 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200
2 1000001 P00087842 F 0-17 10 A 2 0 12 NaN NaN 1422
3 1000001 P00085442 F 0-17 10 A 2 0 12 14.0 NaN 1057
4 1000002 P00285442 M 55+ 16 C 4+ 0 8 NaN NaN 7969
... ... ... ... ... ... ... ... ... ... ... ... ...
550063 1006033 P00372445 M 51-55 13 B 1 1 20 NaN NaN 368
550064 1006035 P00375436 F 26-35 1 C 3 0 20 NaN NaN 371
550065 1006036 P00375436 F 26-35 15 B 4+ 1 20 NaN NaN 137
550066 1006038 P00375436 F 55+ 1 C 2 0 20 NaN NaN 365
550067 1006039 P00371644 F 46-50 0 B 4+ 1 20 NaN NaN 490

550068 rows × 12 columns

3 . Missing values¶

3.1 Identificar missing values¶

In [25]:
df.isna().sum()
Out[25]:
User-ID                            0
Product-ID                         0
Gender                             0
Idade                              0
Occupation                         0
Cidade                             0
Stay-In-Current-City-Years         0
Marital-Status                     0
Product-Category-1                 0
Product-Category-2            173638
Product-Category-3            383247
Purchase                           0
dtype: int64
In [26]:
df.isna().mean()
Out[26]:
User-ID                       0.000000
Product-ID                    0.000000
Gender                        0.000000
Idade                         0.000000
Occupation                    0.000000
Cidade                        0.000000
Stay-In-Current-City-Years    0.000000
Marital-Status                0.000000
Product-Category-1            0.000000
Product-Category-2            0.315666
Product-Category-3            0.696727
Purchase                      0.000000
dtype: float64

3.2 Preencher missing values¶

  • Função fill preenche os valores missing.
    • Podemos usar média, valor anterior ou valor posterior etc..
In [27]:
df = pd.read_csv('train.csv')
df.Product_Category_3.value_counts()
Out[27]:
16.0    32636
15.0    28013
14.0    18428
17.0    16702
5.0     16658
8.0     12562
9.0     11579
12.0     9246
13.0     5459
6.0      4890
18.0     4629
4.0      1875
11.0     1805
10.0     1726
3.0       613
Name: Product_Category_3, dtype: int64

Moda, Média, Mediana¶

In [28]:
# Moda
moda = df['Product_Category_3'].mode()[0]
media = df['Product_Category_3'].mean()
mediana = df['Product_Category_3'].median()

print('Moda:', moda)
print('Média:', media)
print('Mediana:', mediana)

df['Product_Category_3'].fillna(value=media, inplace=True)
df.Product_Category_3.value_counts()
Moda: 16.0
Média: 12.668243206790512
Mediana: 14.0
Out[28]:
12.668243    383247
16.000000     32636
15.000000     28013
14.000000     18428
17.000000     16702
5.000000      16658
8.000000      12562
9.000000      11579
12.000000      9246
13.000000      5459
6.000000       4890
18.000000      4629
4.000000       1875
11.000000      1805
10.000000      1726
3.000000        613
Name: Product_Category_3, dtype: int64

Pad, Backfill¶

In [29]:
df = pd.read_csv('train.csv')
df.head(10)
Out[29]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 1000001 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370
1 1000001 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200
2 1000001 P00087842 F 0-17 10 A 2 0 12 NaN NaN 1422
3 1000001 P00085442 F 0-17 10 A 2 0 12 14.0 NaN 1057
4 1000002 P00285442 M 55+ 16 C 4+ 0 8 NaN NaN 7969
5 1000003 P00193542 M 26-35 15 A 3 0 1 2.0 NaN 15227
6 1000004 P00184942 M 46-50 7 B 2 1 1 8.0 17.0 19215
7 1000004 P00346142 M 46-50 7 B 2 1 1 15.0 NaN 15854
8 1000004 P0097242 M 46-50 7 B 2 1 1 16.0 NaN 15686
9 1000005 P00274942 M 26-35 20 A 1 1 8 NaN NaN 7871
In [30]:
df = pd.read_csv('train.csv')
df['Product_Category_3'].fillna(method = "pad", inplace=True)
df.Product_Category_3.value_counts()
df.head(10)
Out[30]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 1000001 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370
1 1000001 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200
2 1000001 P00087842 F 0-17 10 A 2 0 12 NaN 14.0 1422
3 1000001 P00085442 F 0-17 10 A 2 0 12 14.0 14.0 1057
4 1000002 P00285442 M 55+ 16 C 4+ 0 8 NaN 14.0 7969
5 1000003 P00193542 M 26-35 15 A 3 0 1 2.0 14.0 15227
6 1000004 P00184942 M 46-50 7 B 2 1 1 8.0 17.0 19215
7 1000004 P00346142 M 46-50 7 B 2 1 1 15.0 17.0 15854
8 1000004 P0097242 M 46-50 7 B 2 1 1 16.0 17.0 15686
9 1000005 P00274942 M 26-35 20 A 1 1 8 NaN 17.0 7871
In [31]:
df = pd.read_csv('train.csv')
df['Product_Category_2'].fillna(method = "backfill", inplace=True)
df.Product_Category_2.value_counts()
df.head(10)
Out[31]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 1000001 P00069042 F 0-17 10 A 2 0 3 6.0 NaN 8370
1 1000001 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200
2 1000001 P00087842 F 0-17 10 A 2 0 12 14.0 NaN 1422
3 1000001 P00085442 F 0-17 10 A 2 0 12 14.0 NaN 1057
4 1000002 P00285442 M 55+ 16 C 4+ 0 8 2.0 NaN 7969
5 1000003 P00193542 M 26-35 15 A 3 0 1 2.0 NaN 15227
6 1000004 P00184942 M 46-50 7 B 2 1 1 8.0 17.0 19215
7 1000004 P00346142 M 46-50 7 B 2 1 1 15.0 NaN 15854
8 1000004 P0097242 M 46-50 7 B 2 1 1 16.0 NaN 15686
9 1000005 P00274942 M 26-35 20 A 1 1 8 11.0 NaN 7871

3.3 Deletando Missing Values¶

In [32]:
df = pd.read_csv('train.csv')
df.isna().sum()
Out[32]:
User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
dtype: int64
In [33]:
# axis = 0 deleta linhas com missing values
# axis = 1 deleta colunas com missing values
df = pd.read_csv('train.csv')
df.dropna(axis=0, thresh= 11, inplace=True)
In [34]:
df.isna().sum()
Out[34]:
User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2                 0
Product_Category_3            209609
Purchase                           0
dtype: int64

4 Selecionar linhas por condições¶

Operadores¶

In [36]:
# Pode usar também:
# == para igual
# >= para maior igual 
# <= para menor ou igual
# & para and
# | para ou 
df = pd.read_csv('train.csv')
A_0_17 = df[(df.Age == '0-17') & (df.City_Category == 'A')]
A_0_17
Out[36]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 1000001 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370
1 1000001 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200
2 1000001 P00087842 F 0-17 10 A 2 0 12 NaN NaN 1422
3 1000001 P00085442 F 0-17 10 A 2 0 12 14.0 NaN 1057
85 1000019 P00112542 M 0-17 10 A 3 0 1 11.0 15.0 7746
... ... ... ... ... ... ... ... ... ... ... ... ...
548970 1004478 P00372445 M 0-17 12 A 1 0 20 NaN NaN 246
548982 1004493 P00372445 F 0-17 10 A 1 0 20 NaN NaN 374
549222 1004807 P00372445 M 0-17 2 A 1 0 20 NaN NaN 359
549524 1005255 P00372445 M 0-17 10 A 3 0 20 NaN NaN 364
549562 1005302 P00372445 F 0-17 10 A 1 0 20 NaN NaN 139

2544 rows × 12 columns

4.1 Query¶

In [37]:
df = pd.read_csv('train.csv')
df = df.query('500 < Purchase < 800')
df
Out[37]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
80 1000018 P0094142 F 18-25 3 B 3 0 4 5.0 NaN 697
122 1000023 P00112342 M 36-45 0 B 3 1 13 16.0 NaN 584
147 1000028 P00084442 F 26-35 1 C 2 1 13 16.0 NaN 758
371 1000060 P00132042 M 51-55 1 C 1 1 13 16.0 NaN 742
744 1000140 P00084642 F 36-45 1 B 3 0 13 16.0 NaN 770
... ... ... ... ... ... ... ... ... ... ... ... ...
550023 1005972 P00371644 F 26-35 20 B 0 0 20 NaN NaN 598
550038 1005995 P00375436 F 36-45 1 C 2 1 20 NaN NaN 596
550039 1005996 P00371644 F 26-35 0 B 1 1 20 NaN NaN 595
550048 1006010 P00371644 M 36-45 0 C 1 0 20 NaN NaN 591
550061 1006029 P00372445 F 26-35 1 C 1 1 20 NaN NaN 599

5297 rows × 12 columns

4.2 isin¶

In [38]:
df = pd.read_csv('train.csv')
df = df[df['Stay_In_Current_City_Years'].isin(['4+', '3'])]
df
Out[38]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
4 1000002 P00285442 M 55+ 16 C 4+ 0 8 NaN NaN 7969
5 1000003 P00193542 M 26-35 15 A 3 0 1 2.0 NaN 15227
19 1000008 P00249542 M 26-35 12 C 4+ 1 1 5.0 15.0 19614
20 1000008 P00220442 M 26-35 12 C 4+ 1 5 14.0 NaN 8584
21 1000008 P00156442 M 26-35 12 C 4+ 1 8 NaN NaN 9872
... ... ... ... ... ... ... ... ... ... ... ... ...
550056 1006022 P00375436 M 26-35 17 C 4+ 0 20 NaN NaN 254
550062 1006032 P00372445 M 46-50 7 A 3 0 20 NaN NaN 473
550064 1006035 P00375436 F 26-35 1 C 3 0 20 NaN NaN 371
550065 1006036 P00375436 F 26-35 15 B 4+ 1 20 NaN NaN 137
550067 1006039 P00371644 F 46-50 0 B 4+ 1 20 NaN NaN 490

180011 rows × 12 columns

4.3 ~ ou not¶

In [39]:
df = pd.read_csv('train.csv')
df = df[~df['Stay_In_Current_City_Years'].isin(['4+', '3'])]
df
Out[39]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 1000001 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370
1 1000001 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200
2 1000001 P00087842 F 0-17 10 A 2 0 12 NaN NaN 1422
3 1000001 P00085442 F 0-17 10 A 2 0 12 14.0 NaN 1057
6 1000004 P00184942 M 46-50 7 B 2 1 1 8.0 17.0 19215
... ... ... ... ... ... ... ... ... ... ... ... ...
550059 1006025 P00370853 F 26-35 1 B 1 0 19 NaN NaN 48
550060 1006026 P00371644 M 36-45 6 C 1 1 20 NaN NaN 494
550061 1006029 P00372445 F 26-35 1 C 1 1 20 NaN NaN 599
550063 1006033 P00372445 M 51-55 13 B 1 1 20 NaN NaN 368
550066 1006038 P00375436 F 55+ 1 C 2 0 20 NaN NaN 365

370057 rows × 12 columns

5 Funções de Agregação¶

5.1 Varias funções de agregação com Groupby¶

In [40]:
df[['Age','Gender','Purchase']].groupby(['Age','Gender']).agg(['mean','count', 'sum', 'median']).head()
Out[40]:
Purchase
mean count sum median
Age Gender
0-17 F 8286.694085 3635 30122133 7800
M 9244.590496 7050 65174363 8113
18-25 F 8345.538682 17463 145738142 7647
M 9445.586493 50686 478758997 8124
26-35 F 8791.494875 32879 289055560 7909

5.1 Varias funções de agregação em diferentes grupos¶

In [41]:
df_dif_group = df[['Product_Category_1','Occupation','Purchase']].groupby('Occupation').agg({'Purchase':'mean', 'Product_Category_1':'sum'})
df_dif_group.head()
Out[41]:
Purchase Product_Category_1
Occupation
0 9173.296450 257553
1 8922.901463 181705
2 8773.293632 105368
3 9260.436527 65360
4 9216.389526 258181

6 Selecionar Colunas pelo tipo de dado¶

In [42]:
df.dtypes
Out[42]:
User_ID                         int64
Product_ID                     object
Gender                         object
Age                            object
Occupation                      int64
City_Category                  object
Stay_In_Current_City_Years     object
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Purchase                        int64
dtype: object
In [43]:
df.select_dtypes(include='number').head()
Out[43]:
User_ID Occupation Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 1000001 10 0 3 NaN NaN 8370
1 1000001 10 0 1 6.0 14.0 15200
2 1000001 10 0 12 NaN NaN 1422
3 1000001 10 0 12 14.0 NaN 1057
6 1000004 7 1 1 8.0 17.0 19215
In [44]:
df.select_dtypes(include='object').head()
Out[44]:
Product_ID Gender Age City_Category Stay_In_Current_City_Years
0 P00069042 F 0-17 A 2
1 P00248942 F 0-17 A 2
2 P00087842 F 0-17 A 2
3 P00085442 F 0-17 A 2
6 P00184942 M 46-50 B 2
In [45]:
df.select_dtypes(include=['number', 'object']).head()
Out[45]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 1000001 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370
1 1000001 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200
2 1000001 P00087842 F 0-17 10 A 2 0 12 NaN NaN 1422
3 1000001 P00085442 F 0-17 10 A 2 0 12 14.0 NaN 1057
6 1000004 P00184942 M 46-50 7 B 2 1 1 8.0 17.0 19215
In [46]:
df.select_dtypes(exclude='number').head()
Out[46]:
Product_ID Gender Age City_Category Stay_In_Current_City_Years
0 P00069042 F 0-17 A 2
1 P00248942 F 0-17 A 2
2 P00087842 F 0-17 A 2
3 P00085442 F 0-17 A 2
6 P00184942 M 46-50 B 2

7. Converter strings em numéricos¶

In [47]:
df = pd.read_csv('train.csv')
df['Stay_In_Current_City_Years'].unique()
Out[47]:
array(['2', '4+', '3', '1', '0'], dtype=object)
In [48]:
df.dtypes
Out[48]:
User_ID                         int64
Product_ID                     object
Gender                         object
Age                            object
Occupation                      int64
City_Category                  object
Stay_In_Current_City_Years     object
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Purchase                        int64
dtype: object
In [49]:
#df.astype({'Stay-In-Current-City-Years': 'int'})
# converte os objects ou inputs inválidos na mudança para numerico em NaN
pd.to_numeric(df['Stay_In_Current_City_Years'], errors = 'coerce')
Out[49]:
0         2.0
1         2.0
2         2.0
3         2.0
4         NaN
         ... 
550063    1.0
550064    3.0
550065    NaN
550066    2.0
550067    NaN
Name: Stay_In_Current_City_Years, Length: 550068, dtype: float64
In [50]:
pd.to_numeric(df['Stay_In_Current_City_Years'], errors = 'coerce').fillna(4)
Out[50]:
0         2.0
1         2.0
2         2.0
3         2.0
4         4.0
         ... 
550063    1.0
550064    3.0
550065    4.0
550066    2.0
550067    4.0
Name: Stay_In_Current_City_Years, Length: 550068, dtype: float64
In [51]:
df['Stay_In_Current_City_Years']= pd.to_numeric(df['Stay_In_Current_City_Years'], errors = 'coerce').fillna(4)
df['Stay_In_Current_City_Years'].unique()
Out[51]:
array([2., 4., 3., 1., 0.])

8. Dividir o dataframe em 2 subsets aletóriamente¶

In [53]:
# tamanho do dataframe
len(df)
Out[53]:
550068
In [54]:
df_1 = df.sample(frac = 0.8 , random_state = 10)
In [55]:
df_2 = df.drop(df_1.index)
In [56]:
len(df_1 ) + len(df_2)
Out[56]:
550068
In [57]:
df_1.index.sort_values()
Out[57]:
Int64Index([     0,      1,      3,      4,      5,      7,      8,      9,
                11,     12,
            ...
            550055, 550056, 550058, 550059, 550060, 550061, 550063, 550065,
            550066, 550067],
           dtype='int64', length=440054)
In [58]:
df_2.index.sort_values()
Out[58]:
Int64Index([     2,      6,     10,     13,     15,     16,     18,     19,
                21,     22,
            ...
            550020, 550032, 550033, 550042, 550044, 550049, 550053, 550057,
            550062, 550064],
           dtype='int64', length=110014)

9. Alterando o shape de um multiIndex dataframe¶

In [59]:
df = pd.read_csv('train.csv')
df.groupby(['Age','Gender']).Purchase.mean()
Out[59]:
Age    Gender
0-17   F         8338.771985
       M         9235.173670
18-25  F         8343.180201
       M         9440.942971
26-35  F         8728.251754
       M         9410.337578
36-45  F         8959.844056
       M         9453.193643
46-50  F         8842.098947
       M         9357.471509
51-55  F         9042.449666
       M         9705.094802
55+    F         9007.036199
       M         9438.195603
Name: Purchase, dtype: float64
In [60]:
df.groupby(['Age','Gender']).Purchase.mean().unstack()
Out[60]:
Gender F M
Age
0-17 8338.771985 9235.173670
18-25 8343.180201 9440.942971
26-35 8728.251754 9410.337578
36-45 8959.844056 9453.193643
46-50 8842.098947 9357.471509
51-55 9042.449666 9705.094802
55+ 9007.036199 9438.195603

10. Pivot Table¶

In [61]:
df.pivot_table(index = 'Age', columns = 'Gender', values = 'Purchase', aggfunc = 'mean')
Out[61]:
Gender F M
Age
0-17 8338.771985 9235.173670
18-25 8343.180201 9440.942971
26-35 8728.251754 9410.337578
36-45 8959.844056 9453.193643
46-50 8842.098947 9357.471509
51-55 9042.449666 9705.094802
55+ 9007.036199 9438.195603
In [62]:
df.pivot_table(index = 'Age', columns = 'Gender', values = 'Purchase', aggfunc = 'mean', margins = True)
Out[62]:
Gender F M All
Age
0-17 8338.771985 9235.173670 8933.464640
18-25 8343.180201 9440.942971 9169.663606
26-35 8728.251754 9410.337578 9252.690633
36-45 8959.844056 9453.193643 9331.350695
46-50 8842.098947 9357.471509 9208.625697
51-55 9042.449666 9705.094802 9534.808031
55+ 9007.036199 9438.195603 9336.280459
All 8734.565765 9437.526040 9263.968713
In [63]:
df.Purchase.mean()
Out[63]:
9263.968712959126
In [64]:
df[df.Age == '0-17']['Purchase'].mean()
Out[64]:
8933.464640444974

7. Value counts com dados normalizados¶

In [65]:
df.Age.value_counts()
Out[65]:
26-35    219587
36-45    110013
18-25     99660
46-50     45701
51-55     38501
55+       21504
0-17      15102
Name: Age, dtype: int64
In [66]:
df.Age.value_counts(normalize = True)
Out[66]:
26-35    0.399200
36-45    0.199999
18-25    0.181178
46-50    0.083082
51-55    0.069993
55+      0.039093
0-17     0.027455
Name: Age, dtype: float64

9. Selecionar range de linhas e colunas¶

9.1 Por index¶

In [67]:
df.iloc[:5, :6]
Out[67]:
User_ID Product_ID Gender Age Occupation City_Category
0 1000001 P00069042 F 0-17 10 A
1 1000001 P00248942 F 0-17 10 A
2 1000001 P00087842 F 0-17 10 A
3 1000001 P00085442 F 0-17 10 A
4 1000002 P00285442 M 55+ 16 C

Por labels das colunas¶

In [68]:
df.loc[ : , ['Age', 'Gender', 'Purchase']]
Out[68]:
Age Gender Purchase
0 0-17 F 8370
1 0-17 F 15200
2 0-17 F 1422
3 0-17 F 1057
4 55+ M 7969
... ... ... ...
550063 51-55 M 368
550064 26-35 F 371
550065 26-35 F 137
550066 55+ F 365
550067 46-50 F 490

550068 rows × 3 columns

10. Substituindo valores em colunas¶

In [69]:
df = pd.read_csv('train.csv')
df.head()
Out[69]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 1000001 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370
1 1000001 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200
2 1000001 P00087842 F 0-17 10 A 2 0 12 NaN NaN 1422
3 1000001 P00085442 F 0-17 10 A 2 0 12 14.0 NaN 1057
4 1000002 P00285442 M 55+ 16 C 4+ 0 8 NaN NaN 7969
In [71]:
df['Stay_In_Current_City_Years'].unique()
Out[71]:
array(['2', '4+', '3', '1', '0'], dtype=object)
In [72]:
Stay_In_Current_City_Years_values = {'0': 0, '1': 1, '2':2, '3':3 ,'4+':4}
In [73]:
df['Stay_In_Current_City_Years'].replace(Stay_In_Current_City_Years_values, inplace=True)
In [74]:
df.head()
Out[74]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 1000001 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370
1 1000001 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200
2 1000001 P00087842 F 0-17 10 A 2 0 12 NaN NaN 1422
3 1000001 P00085442 F 0-17 10 A 2 0 12 14.0 NaN 1057
4 1000002 P00285442 M 55+ 16 C 4 0 8 NaN NaN 7969

Bonus - 1¶

Converter numéros contínuos em categorical¶

In [75]:
df.head()
Out[75]:
User_ID Product_ID Gender Age Occupation City_Category Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase
0 1000001 P00069042 F 0-17 10 A 2 0 3 NaN NaN 8370
1 1000001 P00248942 F 0-17 10 A 2 0 1 6.0 14.0 15200
2 1000001 P00087842 F 0-17 10 A 2 0 12 NaN NaN 1422
3 1000001 P00085442 F 0-17 10 A 2 0 12 14.0 NaN 1057
4 1000002 P00285442 M 55+ 16 C 4 0 8 NaN NaN 7969
In [76]:
print('min:', df.Purchase.min())
print('mediana:', df.Purchase.median())
print('moda:', df.Purchase.mode()[0])
print('mean:', df.Purchase.mean())
print('max:', df.Purchase.max())
min: 12
mediana: 8047.0
moda: 7011
mean: 9263.968712959126
max: 23961
In [77]:
import matplotlib.pyplot as plt
df.Purchase.plot(kind='hist', title='Distribuição Vendas',
figsize=(10,6))

mean = df.Purchase.mean()
moda = df.Purchase.mode()[0]
mediana = df.Purchase.median()

plt.axvline(mean,color='b', linestyle='--')
plt.axvline(moda,color='green', linestyle='--')
plt.axvline(mediana,color='red', linestyle='--')

plt.legend({'Média':mean,'Moda':moda,'Mediana':mediana})
Out[77]:
<matplotlib.legend.Legend at 0x1f3bcbf94c0>
In [78]:
pd.cut(df.Purchase, bins = [0, 5000, 15000,25000], labels = ['Pouco', 'Médio','Satisfatório'])
Out[78]:
0                Médio
1         Satisfatório
2                Pouco
3                Pouco
4                Médio
              ...     
550063           Pouco
550064           Pouco
550065           Pouco
550066           Pouco
550067           Pouco
Name: Purchase, Length: 550068, dtype: category
Categories (3, object): ['Pouco' < 'Médio' < 'Satisfatório']
In [ ]:
df.shape
In [79]:
df['Purchase_labels'] = pd.cut(df.Purchase, bins = [0, 5000, 15000,25000], labels = ['Pouco', 'Médio','Satisfatório'])
df.head()
Out[79]:
User_ID Product_ID Gender Age Occupation City_Category ... Marital_Status Product_Category_1 Product_Category_2 Product_Category_3 Purchase Purchase_labels
0 1000001 P00069042 F 0-17 10 A ... 0 3 NaN NaN 8370 Médio
1 1000001 P00248942 F 0-17 10 A ... 0 1 6.0 14.0 15200 Satisfatório
2 1000001 P00087842 F 0-17 10 A ... 0 12 NaN NaN 1422 Pouco
3 1000001 P00085442 F 0-17 10 A ... 0 12 14.0 NaN 1057 Pouco
4 1000002 P00285442 M 55+ 16 C ... 0 8 NaN NaN 7969 Médio

5 rows × 13 columns

Bonus - 2 Relatório Estatístico rápido¶

Pandas Profiling - report¶

In [ ]:
#!pip install pandas-profiling
In [1]:
import pandas_profiling
In [6]:
df = pd.read_csv('train.csv')
pandas_profiling.ProfileReport(df)
HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=25.0, style=ProgressStyle(descrip…

HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…

HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…

Out[6]:

In [ ]: